In [8]:
import zipfile
from urllib.request import urlopen
import os

source_url = 'ftp://ftp.nhtsa.gov/GES/GES12/GES12_Flatfile.zip'
zip_name = 'GES12_Flatfile.zip'
cwd = os.getcwd()
dir_path  = os.path.join(cwd, 'GES2012')
zip_path = os.path.join(dir_path, zip_name)

# We'll make a directory for you to play around with,
# then when you're done playing you can just delete the directory
if not os.path.exists(dir_path):
    os.makedirs(dir_path)

# Download the file from GES website if you haven't already
if not os.path.exists(zip_path):
    response = urlopen(source_url)
    with open(zip_path, 'wb') as fh:
        x = response.read()
        fh.write(x)

# Extract all the files from that zipfile
with zipfile.ZipFile(os.path.join(dir_path, zip_name), 'r') as z:
    z.extractall(dir_path)


---------------------------------------------------------------------------
URLError                                  Traceback (most recent call last)
<ipython-input-8-fae59f649239> in <module>()
     16 # Download the file from GES website if you haven't already
     17 if not os.path.exists(zip_path):
---> 18     response = urlopen(source_url)
     19     with open(zip_path, 'wb') as fh:
     20         x = response.read()

/Users/agreenhut/anaconda/lib/python3.4/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault)
    151     else:
    152         opener = _opener
--> 153     return opener.open(url, data, timeout)
    154 
    155 def install_opener(opener):

/Users/agreenhut/anaconda/lib/python3.4/urllib/request.py in open(self, fullurl, data, timeout)
    453             req = meth(req)
    454 
--> 455         response = self._open(req, data)
    456 
    457         # post-process response

/Users/agreenhut/anaconda/lib/python3.4/urllib/request.py in _open(self, req, data)
    471         protocol = req.type
    472         result = self._call_chain(self.handle_open, protocol, protocol +
--> 473                                   '_open', req)
    474         if result:
    475             return result

/Users/agreenhut/anaconda/lib/python3.4/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    431         for handler in handlers:
    432             func = getattr(handler, meth_name)
--> 433             result = func(*args)
    434             if result is not None:
    435                 return result

/Users/agreenhut/anaconda/lib/python3.4/urllib/request.py in ftp_open(self, req)
   1394             host = socket.gethostbyname(host)
   1395         except OSError as msg:
-> 1396             raise URLError(msg)
   1397         path, attrs = splitattr(req.selector)
   1398         dirs = path.split('/')

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

In [10]:
#See what we just unzipped
os.listdir(dir_path)


Out[10]:
['2012GESFlatFileTXT.sas',
 'ACCIDENT.TXT',
 'CEVENT.TXT',
 'DAMAGE.TXT',
 'DISTRACT.TXT',
 'DRIMPAIR.TXT',
 'FACTOR.TXT',
 'MANEUVER.TXT',
 'NMCRASH.TXT',
 'NMIMPAIR.TXT',
 'NMPRIOR.TXT',
 'PARKWORK.TXT',
 'PERSON.TXT',
 'SAFETYEQ.TXT',
 'VEHICLE.TXT',
 'VEVENT.TXT',
 'VIOLATN.TXT',
 'VISION.TXT',
 'VSOE.TXT']

In [11]:
import pandas as pd
import numpy as np
import sklearn

cwd = os.getcwd()
dir_path  = os.path.join(cwd, 'GES2012')
input_file_path = os.path.join(dir_path, 'PERSON.TXT')

input_data = pd.read_csv(input_file_path, delimiter='\t')

In [12]:
sorted(input_data.columns)


Out[12]:
['AGE',
 'AGE_IM',
 'AIR_BAG',
 'ALC_RES',
 'ALC_STATUS',
 'ATST_TYP',
 'BODY_TYP',
 'CASENUM',
 'DRINKING',
 'DRUGRES1',
 'DRUGRES2',
 'DRUGRES3',
 'DRUGS',
 'DRUGTST1',
 'DRUGTST2',
 'DRUGTST3',
 'DSTATUS',
 'EJECTION',
 'EJECT_IM',
 'EMER_USE',
 'FIRE_EXP',
 'HARM_EV',
 'HOSPITAL',
 'HOUR',
 'IMPACT1',
 'INJSEV_IM',
 'INJ_SEV',
 'LOCATION',
 'MAKE',
 'MAN_COLL',
 'MINUTE',
 'MOD_YEAR',
 'MONTH',
 'PERALCH_IM',
 'PER_NO',
 'PER_TYP',
 'PJ',
 'PSU',
 'PSUSTRAT',
 'P_SF1',
 'P_SF2',
 'P_SF3',
 'REGION',
 'REST_MIS',
 'REST_USE',
 'ROLLOVER',
 'SCH_BUS',
 'SEAT_IM',
 'SEAT_POS',
 'SEX',
 'SEX_IM',
 'SPEC_USE',
 'STRATUM',
 'STR_VEH',
 'TOW_VEH',
 'VEH_NO',
 'VE_FORMS',
 'WEIGHT']

In [13]:
input_data.INJSEV_IM.value_counts()


Out[13]:
0    100840
2     20758
1     19380
3      9738
5      1179
4      1178
6         4
dtype: int64

In [15]:
# Drop those odd cases
input_data = input_data[input_data.INJSEV_IM != 6]

for column_name in input_data.columns:
    n_nans = input_data[column_name].isnull().sum()
    if n_nans > 0:
        print (column_name, n_nans)


MAKE 5162
BODY_TYP 5162
MOD_YEAR 5162
TOW_VEH 5162
SPEC_USE 5162
EMER_USE 5162
ROLLOVER 5162
IMPACT1 5162
FIRE_EXP 5162

In [21]:
print (input_data.shape)
data = input_data[~input_data.MAKE.isnull()]
discarded = data.pop('INJ_SEV')
target = data.pop('INJSEV_IM')
print (data.shape)


(153073, 58)
(147911, 56)

In [22]:
target = (target == 4).astype('float')

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV

# Train on half of the data while reserving the other half for
# model comparisons
xtrain, xtest, ytrain, ytest = sklearn.cross_validation.train_test_split(
    data.values, target.values, train_size=0.5)

linreg = LinearRegression()
linreg.fit(xtrain, ytrain)

lr_preds = linreg.predict(xtest)
lr_perf = roc_auc_score(ytest, lr_preds)
print ('OLS: Area under the ROC curve = {}'.format(lr_perf))


OLS: Area under the ROC curve = 0.9304890372366488

In [27]:
from sklearn.linear_model import Ridge

ridge = GridSearchCV(Ridge(),
                     {'alpha': np.logspace(-10, 10, 10)})
ridge.fit(xtrain, ytrain)
ridge_preds = ridge.predict(xtest)
ridge_performance = roc_auc_score(ytest, ridge_preds)
print ('Ridge: Area under the ROC curve = {}'.format(ridge_performance))


Ridge: Area under the ROC curve = 0.9305915934001125

In [29]:
from sklearn.linear_model import Lasso
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV

lasso = GridSearchCV(Lasso(),
                     {'alpha': np.logspace(-10, -8, 5)})
lasso.fit(xtrain, ytrain)
lasso_preds = lasso.predict(xtest)
lasso_performance = roc_auc_score(ytest, lasso_preds)
print ('Lasso: Area under the ROC curve = {}'.format(lasso_performance))


Lasso: Area under the ROC curve = 0.9304885977507396
/Users/agreenhut/anaconda/lib/python3.4/site-packages/sklearn/linear_model/coordinate_descent.py:490: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations
  ConvergenceWarning)

In [30]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV

gbm = GradientBoostingClassifier(n_estimators=500)

gbm.fit(xtrain, ytrain)
gbm_preds = gbm.predict_proba(xtest)[:, 1]
gbm_performance = roc_auc_score(ytest, gbm_preds)

print ('GBM: Area under the ROC curve = {}'.format(gbm_performance))


GBM: Area under the ROC curve = 0.9680149385513637

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

tree = GridSearchCV(DecisionTreeClassifier(),
                    {'max_depth': np.arange(3, 10)})

tree.fit(xtrain, ytrain)
tree_preds = tree.predict_proba(xtest)[:, 1]
tree_performance = roc_auc_score(ytest, tree_preds)

print ('DecisionTree: Area under the ROC curve = {}'.format(tree_performance))


DecisionTree: Area under the ROC curve = 0.9396437839112999

In [33]:
importances = pd.Series(gbm.feature_importances_, index=data.columns)
print (importances.order(ascending=False)[:10])


STRATUM     0.106238
EJECT_IM    0.075484
WEIGHT      0.063853
DRINKING    0.053987
HOUR        0.045015
AGE_IM      0.036889
ALC_RES     0.036123
FIRE_EXP    0.034666
HOSPITAL    0.032212
IMPACT1     0.031792
dtype: float64

In [ ]: